Validity
degree to which the data conform to defined business rules or constraints
Accuracy
degree to which the data is close to the true values. A person's eye colour, say blue, might be a valid value but still not accurate (it doesn't represent reality).
Completeness
degree to which all required data is known
Consistency
degree to which the data is consistent, within the same data set or across multiple data sets
Uniformity
degree to which the data is specified using the same unit of measure
1 Inspection
2 Explore cleaning
3 Define & verify pipeline
4 Implementing & reporting
# load packages
import numpy as np, pandas as pd # for numeric/table data handling
import seaborn as sns # for visualizations
Original data source is this Machine Learning dataset: Bank Marketing Data Set. It describes data (bank client and other attributes) used for marketing campaigns of a Portuguese banking institution. Variable definitions are available there.
Here, an abridged and adjusted data set offered on Ilias will be used for training purposes.
# Enable save and load to Google Drive
from google.colab import drive
drive.mount('/content/drive')
Mounted at /content/drive
# Enable save and load to local machine
from google.colab import files
import io # for file handling
# Better dataframe layout in Google Colab
%load_ext google.colab.data_table
#%unload_ext google.colab.data_table # to turn it off again
# Read csv file from Google Drive into Pandas dataframe
infile = '/content/drive/My Drive/.../bank.csv' # insert path to folder with data file
# pd.read_csv accepts a file path directly, so there is no need to open a
# file handle first and pass it in — read_csv opens and closes the file itself
df = pd.read_csv(infile, encoding='UTF-8', sep=';')
df.head(5)
| age | job | marital | education | default | housing | loan | contact | month | day_of_week | duration | campaign | pdays | previous | poutcome | emp.var.rate | cons.price.idx | cons.conf.idx | euribor3m | nr.employed | y | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 29.0 | admin. | single | university.degree | no | no | no | telephone | apr | fri | 48 | 2 | 999 | 0 | nonexistent | -1.8 | 93.075 | -47.1 | 1.405 | 5099.1 | no |
| 1 | 36.0 | admin. | married | university.degree | no | no | no | cellular | aug | thu | 126 | 2 | 999 | 0 | nonexistent | -2.9 | 92.201 | -31.4 | 0.873 | 5076.2 | no |
| 2 | 27.0 | admin. | married | university.degree | no | no | yes | cellular | jul | fri | 979 | 2 | 999 | 0 | nonexistent | 1.4 | 93.918 | -42.7 | 4.962 | 5228.1 | yes |
| 3 | 43.0 | blue-collar | married | high.school | no | no | no | cellular | apr | fri | 997 | 1 | 999 | 1 | failure | -1.8 | 93.075 | -47.1 | 1.405 | 5099.1 | yes |
| 4 | 26.0 | technician | single | professional.course | no | unknown | unknown | cellular | jul | wed | 578 | 1 | 999 | 0 | nonexistent | 1.4 | 93.918 | -42.7 | 4.957 | 5228.1 | yes |
# read csv file into Pandas dataframe from local folder
from os import chdir  # used by the optional chdir() call below
#data_path = '' # insert path to folder with data file
#chdir(data_path)
df = pd.read_csv('bank.csv', sep=';')  # semicolon-separated file in the current working directory
df.head(5)
| age | job | marital | education | default | housing | loan | contact | month | day_of_week | ... | campaign | pdays | previous | poutcome | emp.var.rate | cons.price.idx | cons.conf.idx | euribor3m | nr.employed | y | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 29.0 | admin. | single | university.degree | no | no | no | telephone | apr | fri | ... | 2 | 999 | 0 | nonexistent | -1.8 | 93.075 | -47.1 | 1.405 | 5099.1 | no |
| 1 | 36.0 | admin. | married | university.degree | no | no | no | cellular | aug | thu | ... | 2 | 999 | 0 | nonexistent | -2.9 | 92.201 | -31.4 | 0.873 | 5076.2 | no |
| 2 | 27.0 | admin. | married | university.degree | no | no | yes | cellular | jul | fri | ... | 2 | 999 | 0 | nonexistent | 1.4 | 93.918 | -42.7 | 4.962 | 5228.1 | yes |
| 3 | 43.0 | blue-collar | married | high.school | no | no | no | cellular | apr | fri | ... | 1 | 999 | 1 | failure | -1.8 | 93.075 | -47.1 | 1.405 | 5099.1 | yes |
| 4 | 26.0 | technician | single | professional.course | no | unknown | unknown | cellular | jul | wed | ... | 1 | 999 | 0 | nonexistent | 1.4 | 93.918 | -42.7 | 4.957 | 5228.1 | yes |
5 rows × 21 columns
First view on data
df.head()
| age | job | marital | education | default | housing | loan | contact | month | day_of_week | ... | campaign | pdays | previous | poutcome | emp.var.rate | cons.price.idx | cons.conf.idx | euribor3m | nr.employed | y | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 29.0 | admin. | single | university.degree | no | no | no | telephone | apr | fri | ... | 2 | 999 | 0 | nonexistent | -1.8 | 93.075 | -47.1 | 1.405 | 5099.1 | no |
| 1 | 36.0 | admin. | married | university.degree | no | no | no | cellular | aug | thu | ... | 2 | 999 | 0 | nonexistent | -2.9 | 92.201 | -31.4 | 0.873 | 5076.2 | no |
| 2 | 27.0 | admin. | married | university.degree | no | no | yes | cellular | jul | fri | ... | 2 | 999 | 0 | nonexistent | 1.4 | 93.918 | -42.7 | 4.962 | 5228.1 | yes |
| 3 | 43.0 | blue-collar | married | high.school | no | no | no | cellular | apr | fri | ... | 1 | 999 | 1 | failure | -1.8 | 93.075 | -47.1 | 1.405 | 5099.1 | yes |
| 4 | 26.0 | technician | single | professional.course | no | unknown | unknown | cellular | jul | wed | ... | 1 | 999 | 0 | nonexistent | 1.4 | 93.918 | -42.7 | 4.957 | 5228.1 | yes |
5 rows × 21 columns
# inspect a certain df section
print(df.iloc[38:43, 0:6])
age job marital education default housing 38 45.0 housemaid divorced basic.4y unknown no 39 34.0 management married university.degree no no 40 37.0 housemaid single university.degree no no 41 29.0 admin. single university.degree no yes 42 55.0 management married university.degree no yes
Validity: duplicates, datatypes, ranges, set-memberships, uniques, unknown/missing values
# duplicate rows
# df.duplicated() already returns a boolean mask, so comparing it with '==True'
# is redundant. subset=None checks all columns; keep=False marks every member
# of a duplicate group (not just the later occurrences).
print(df[df.duplicated(subset=None, keep=False)])
age job marital education default housing loan contact \
423 45.0 admin. married university.degree no no no cellular
4113 45.0 admin. married university.degree no no no cellular
month day_of_week ... campaign pdays previous poutcome \
423 jul thu ... 1 999 0 nonexistent
4113 jul thu ... 1 999 0 nonexistent
emp.var.rate cons.price.idx cons.conf.idx euribor3m nr.employed y
423 -2.9 92.469 -33.6 1.072 5076.2 yes
4113 -2.9 92.469 -33.6 1.072 5076.2 yes
[2 rows x 21 columns]
# datatypes
print(df.dtypes)
age float64 job object marital object education object default object housing object loan object contact object month object day_of_week object duration int64 campaign int64 pdays int64 previous int64 poutcome object emp.var.rate float64 cons.price.idx float64 cons.conf.idx float64 euribor3m float64 nr.employed float64 y object dtype: object
# numeric variable: range (and more)
print(df['age'].describe())
count 10324.000000 mean 40.359938 std 11.800919 min 17.000000 25% 31.000000 50% 38.000000 75% 48.000000 max 98.000000 Name: age, dtype: float64
# categorical variable: set-membership/levels with occurrences
print(df.education.dtype)
print('----')
print(df.education.value_counts()) # Note: value_counts() excludes NA by default
object ---- university.degree 3284 high.school 2358 professional.course 1374 basic.9y 1321 basic.4y 1015 basic.6y 528 unknown 479 illiterate 6 Name: education, dtype: int64
# unique values - with NAs
print(df.marital.unique()) # categorical
print(df.pdays.unique()) # numeric
print('----')
# check whether all values in the column are unique; Series.is_unique is the
# built-in equivalent of len(df.age.unique()) == len(df.age)
print(df.age.is_unique)
['single' 'married' 'divorced' 'unknown' nan] [999 6 3 0 8 18 7 4 2 5 12 11 9 10 13 15 16 26 1 14 22 17 27 25 19 21] ---- False
# missing values
# overall count of missing entries: NaN in numeric, None/NaN in object, NaT in datetime
total_missing = df.isnull().sum().sum()
print(total_missing)
print('----')
# number of rows that contain at least one missing value
rows_with_na = df.isnull().any(axis=1).sum()
print(rows_with_na)
print('----')
# missing values broken down per column
print(df.isnull().sum())
143 ---- 141 ---- age 41 job 0 marital 102 education 0 default 0 housing 0 loan 0 contact 0 month 0 day_of_week 0 duration 0 campaign 0 pdays 0 previous 0 poutcome 0 emp.var.rate 0 cons.price.idx 0 cons.conf.idx 0 euribor3m 0 nr.employed 0 y 0 dtype: int64
# dataframe shape, datatypes, no of non-missing values
print(df.info())
<class 'pandas.core.frame.DataFrame'> RangeIndex: 10365 entries, 0 to 10364 Data columns (total 21 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 age 10324 non-null float64 1 job 10365 non-null object 2 marital 10263 non-null object 3 education 10365 non-null object 4 default 10365 non-null object 5 housing 10365 non-null object 6 loan 10365 non-null object 7 contact 10365 non-null object 8 month 10365 non-null object 9 day_of_week 10365 non-null object 10 duration 10365 non-null int64 11 campaign 10365 non-null int64 12 pdays 10365 non-null int64 13 previous 10365 non-null int64 14 poutcome 10365 non-null object 15 emp.var.rate 10365 non-null float64 16 cons.price.idx 10365 non-null float64 17 cons.conf.idx 10365 non-null float64 18 euribor3m 10365 non-null float64 19 nr.employed 10365 non-null float64 20 y 10365 non-null object dtypes: float64(6), int64(4), object(11) memory usage: 1.7+ MB None
# basic statistics for numeric variables
df.describe().round(1) # add '.style' for nice view on local machine
| age | duration | campaign | pdays | previous | emp.var.rate | cons.price.idx | cons.conf.idx | euribor3m | nr.employed | |
|---|---|---|---|---|---|---|---|---|---|---|
| count | 10324.0 | 10365.0 | 10365.0 | 10365.0 | 10365.0 | 10365.0 | 10365.0 | 10365.0 | 10365.0 | 10365.0 |
| mean | 40.4 | 369.9 | 2.4 | 899.8 | 0.3 | -0.4 | 93.5 | -40.2 | 3.1 | 5140.3 |
| std | 11.8 | 349.5 | 2.5 | 297.8 | 0.7 | 1.7 | 0.6 | 5.3 | 1.9 | 85.8 |
| min | 17.0 | 0.0 | 1.0 | 0.0 | 0.0 | -3.4 | 92.2 | -50.8 | 0.6 | 4963.6 |
| 25% | 31.0 | 136.0 | 1.0 | 999.0 | 0.0 | -1.8 | 93.0 | -42.7 | 1.3 | 5076.2 |
| 50% | 38.0 | 252.0 | 2.0 | 999.0 | 0.0 | -0.1 | 93.4 | -41.8 | 4.1 | 5191.0 |
| 75% | 48.0 | 493.0 | 3.0 | 999.0 | 0.0 | 1.4 | 94.0 | -36.4 | 5.0 | 5228.1 |
| max | 98.0 | 4199.0 | 43.0 | 999.0 | 6.0 | 1.4 | 94.8 | -26.9 | 5.0 | 5228.1 |
# basic statistics for non-numeric variables
df.describe(include = 'object') # add '.style' for nice view on local machine
| job | marital | education | default | housing | loan | contact | month | day_of_week | poutcome | y | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 10365 | 10263 | 10365 | 10365 | 10365 | 10365 | 10365 | 10365 | 10365 | 10365 | 10365 |
| unique | 12 | 4 | 8 | 2 | 3 | 3 | 2 | 10 | 5 | 3 | 2 |
| top | admin. | married | university.degree | no | yes | no | cellular | may | thu | nonexistent | no |
| freq | 2761 | 5973 | 3284 | 8658 | 5468 | 8603 | 7308 | 2912 | 2257 | 8259 | 5725 |
# crosstab of categorical variables
pd.crosstab(df.marital, df.education)
| education | basic.4y | basic.6y | basic.9y | high.school | illiterate | professional.course | university.degree | unknown |
|---|---|---|---|---|---|---|---|---|
| marital | ||||||||
| divorced | 141 | 47 | 114 | 265 | 1 | 175 | 337 | 41 |
| married | 770 | 399 | 866 | 1203 | 4 | 802 | 1660 | 269 |
| single | 94 | 77 | 332 | 855 | 1 | 382 | 1247 | 161 |
| unknown | 2 | 1 | 2 | 2 | 0 | 0 | 10 | 3 |
# also works with numeric variables
pd.crosstab(df.marital, df.age)
| age | 17.0 | 18.0 | 19.0 | 20.0 | 21.0 | 22.0 | 23.0 | 24.0 | 25.0 | 26.0 | ... | 83.0 | 84.0 | 85.0 | 86.0 | 87.0 | 88.0 | 89.0 | 92.0 | 94.0 | 98.0 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| marital | |||||||||||||||||||||
| divorced | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 5 | 3 | ... | 5 | 2 | 2 | 0 | 1 | 9 | 2 | 1 | 0 | 0 |
| married | 0 | 0 | 0 | 0 | 4 | 0 | 6 | 24 | 33 | 49 | ... | 4 | 1 | 6 | 4 | 0 | 0 | 0 | 2 | 1 | 2 |
| single | 3 | 13 | 24 | 31 | 35 | 46 | 69 | 130 | 118 | 151 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
| unknown | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
4 rows × 76 columns
# grouped basic statistics
df.groupby(['marital','housing']).size()
marital housing
divorced no 495
unknown 31
yes 595
married no 2697
unknown 148
yes 3128
single no 1389
unknown 74
yes 1686
unknown no 12
yes 8
dtype: int64
# grouped basic statistics of one variable
# mean age per marital status (groupby on the frame, then select the column)
df.groupby('marital')['age'].mean()
marital divorced 46.746416 married 43.347278 single 32.410714 unknown 39.700000 Name: age, dtype: float64
# multi grouping (= according to 2 or more variables)
# numeric_only=True keeps this working on pandas >= 2.0, where calling .mean()
# on a group that still contains object columns raises a TypeError instead of
# silently dropping them
df.groupby(['marital','housing']).mean(numeric_only=True).round(0)
| age | duration | campaign | pdays | previous | emp.var.rate | cons.price.idx | cons.conf.idx | euribor3m | nr.employed | ||
|---|---|---|---|---|---|---|---|---|---|---|---|
| marital | housing | ||||||||||
| divorced | no | 47.0 | 370.0 | 2.0 | 911.0 | 0.0 | -0.0 | 94.0 | -40.0 | 3.0 | 5148.0 |
| unknown | 45.0 | 264.0 | 4.0 | 967.0 | 0.0 | 0.0 | 94.0 | -40.0 | 3.0 | 5159.0 | |
| yes | 47.0 | 367.0 | 2.0 | 906.0 | 0.0 | -0.0 | 94.0 | -41.0 | 3.0 | 5142.0 | |
| married | no | 43.0 | 377.0 | 2.0 | 913.0 | 0.0 | -0.0 | 94.0 | -40.0 | 3.0 | 5150.0 |
| unknown | 44.0 | 364.0 | 2.0 | 912.0 | 0.0 | -0.0 | 94.0 | -40.0 | 3.0 | 5142.0 | |
| yes | 43.0 | 367.0 | 2.0 | 905.0 | 0.0 | -0.0 | 93.0 | -40.0 | 3.0 | 5145.0 | |
| single | no | 33.0 | 369.0 | 2.0 | 889.0 | 0.0 | -1.0 | 93.0 | -40.0 | 3.0 | 5129.0 |
| unknown | 33.0 | 362.0 | 2.0 | 851.0 | 0.0 | -1.0 | 94.0 | -41.0 | 3.0 | 5114.0 | |
| yes | 32.0 | 367.0 | 2.0 | 869.0 | 0.0 | -1.0 | 93.0 | -41.0 | 3.0 | 5122.0 | |
| unknown | no | 38.0 | 521.0 | 4.0 | 751.0 | 0.0 | -1.0 | 93.0 | -38.0 | 3.0 | 5123.0 |
| yes | 42.0 | 452.0 | 3.0 | 999.0 | 0.0 | -0.0 | 93.0 | -40.0 | 4.0 | 5174.0 |
# Pearson correlation
print(df.loc[:, ['age', 'previous']].corr())  # pairwise correlation of two selected columns
print('----')
# full correlation matrix over all non-object (numeric) columns;
# .style renders a nicer table in a notebook environment
df.select_dtypes(exclude=['object']).corr().style
age previous age 1.000000 0.062547 previous 0.062547 1.000000 ----
| age | duration | campaign | pdays | previous | emp.var.rate | cons.price.idx | cons.conf.idx | euribor3m | nr.employed | |
|---|---|---|---|---|---|---|---|---|---|---|
| age | 1.000000 | -0.018313 | -0.004775 | -0.063010 | 0.062547 | -0.045142 | -0.013744 | 0.136599 | -0.042529 | -0.072754 |
| duration | -0.018313 | 1.000000 | -0.035912 | 0.009933 | -0.025773 | 0.036029 | 0.037336 | -0.049038 | 0.022456 | 0.024367 |
| campaign | -0.004775 | -0.035912 | 1.000000 | 0.088615 | -0.098749 | 0.192897 | 0.136358 | -0.031329 | 0.180767 | 0.183130 |
| pdays | -0.063010 | 0.009933 | 0.088615 | 1.000000 | -0.704132 | 0.332958 | 0.045204 | -0.154077 | 0.381548 | 0.466952 |
| previous | 0.062547 | -0.025773 | -0.098749 | -0.704132 | 1.000000 | -0.393684 | -0.079791 | 0.073198 | -0.456187 | -0.528477 |
| emp.var.rate | -0.045142 | 0.036029 | 0.192897 | 0.332958 | -0.393684 | 1.000000 | 0.728579 | -0.035052 | 0.961152 | 0.874719 |
| cons.price.idx | -0.013744 | 0.037336 | 0.136358 | 0.045204 | -0.079791 | 0.728579 | 1.000000 | -0.133359 | 0.589408 | 0.374915 |
| cons.conf.idx | 0.136599 | -0.049038 | -0.031329 | -0.154077 | 0.073198 | -0.035052 | -0.133359 | 1.000000 | 0.067519 | -0.071665 |
| euribor3m | -0.042529 | 0.022456 | 0.180767 | 0.381548 | -0.456187 | 0.961152 | 0.589408 | 0.067519 | 1.000000 | 0.941777 |
| nr.employed | -0.072754 | 0.024367 | 0.183130 | 0.466952 | -0.528477 | 0.874719 | 0.374915 | -0.071665 | 0.941777 | 1.000000 |
# categorical: Bar chart
df.marital.value_counts().plot(kind='bar')
<AxesSubplot:>
#
df.job.value_counts().plot(kind='pie')
<AxesSubplot:ylabel='job'>
# Histogram
df.age.plot(kind='hist', bins=20)
<AxesSubplot:ylabel='Frequency'>
# Box-whiskers plot
df.age.plot(kind='box', vert=False)
<AxesSubplot:>
# Density plot
df.age.plot(kind='density')
<AxesSubplot:ylabel='Density'>
ax = sns.scatterplot(x='age', y='duration', data=df, hue='y')
# Numeric: Pair plot (= scatter plot matrix, here with 'marital' groups)
tmp_df = df.iloc[:, 0:15]
sns.pairplot(tmp_df, hue='marital')
<seaborn.axisgrid.PairGrid at 0x21baf1ece08>
# numeric & categorical: Grouped boxplot
df.boxplot(column=['age'], by='marital', vert=False)
<AxesSubplot:title={'center':'age'}, xlabel='marital'>
# grouped boxplot (seaborn)
sns.boxplot(y="marital", x="duration", hue="y", data=df, orient="h", palette="Set3")
<AxesSubplot:xlabel='duration', ylabel='marital'>
# compound boxplot for all numeric values in dataframe
# columns with much larger value ranges are dropped so the remaining boxes stay readable
tmp_df = df.drop(['nr.employed', 'duration', 'pdays'], axis=1)
sns.boxplot(data=tmp_df, orient="h", palette="Set3")
del (tmp_df)
# categorical & categorical: Stacked barchart
df.groupby(['marital', 'housing']).size().unstack().plot(kind='bar',stacked=True)
<AxesSubplot:xlabel='marital'>
# numerics' correlations as heatmap
corr = df.select_dtypes(exclude=['object']).corr()
ax = sns.heatmap(corr, annot=True, fmt='.2f', xticklabels=corr.columns, yticklabels=corr.columns,
cmap=sns.diverging_palette(10, 220, as_cmap=True))
# categoricals' crosstab visualized as heatmap
ct = pd.crosstab( df['job'], df['marital']) # make crosstab
sns.heatmap(ct, annot=True, fmt='d')
<AxesSubplot:xlabel='marital', ylabel='job'>
# install latest version of pandas-profiling directly from Github
!pip install https://github.com/pandas-profiling/pandas-profiling/archive/master.zip
Collecting https://github.com/pandas-profiling/pandas-profiling/archive/master.zip Using cached https://github.com/pandas-profiling/pandas-profiling/archive/master.zip Requirement already satisfied: joblib in c:\users\kwoit\anaconda3\lib\site-packages (from pandas-profiling==2.12.0) (1.0.1) Requirement already satisfied: scipy>=1.4.1 in c:\users\kwoit\anaconda3\lib\site-packages (from pandas-profiling==2.12.0) (1.6.2) Requirement already satisfied: pandas!=1.0.0,!=1.0.1,!=1.0.2,!=1.1.0,>=0.25.3 in c:\users\kwoit\anaconda3\lib\site-packages (from pandas-profiling==2.12.0) (1.2.4) Requirement already satisfied: matplotlib>=3.2.0 in c:\users\kwoit\anaconda3\lib\site-packages (from pandas-profiling==2.12.0) (3.3.4) Collecting confuse>=1.0.0 Downloading confuse-1.4.0-py2.py3-none-any.whl (21 kB) Requirement already satisfied: jinja2>=2.11.1 in c:\users\kwoit\anaconda3\lib\site-packages (from pandas-profiling==2.12.0) (2.11.3) Requirement already satisfied: visions[type_image_path]==0.6.0 in c:\users\kwoit\anaconda3\lib\site-packages (from pandas-profiling==2.12.0) (0.6.0) Requirement already satisfied: numpy>=1.16.0 in c:\users\kwoit\anaconda3\lib\site-packages (from pandas-profiling==2.12.0) (1.19.2) Requirement already satisfied: attrs>=19.3.0 in c:\users\kwoit\anaconda3\lib\site-packages (from pandas-profiling==2.12.0) (20.3.0) Collecting htmlmin>=0.1.12 Downloading htmlmin-0.1.12.tar.gz (19 kB) Collecting missingno>=0.4.2 Downloading missingno-0.4.2-py3-none-any.whl (9.7 kB) Collecting phik>=0.10.0 Downloading phik-0.11.2.tar.gz (1.1 MB) Collecting tangled-up-in-unicode>=0.0.6 Downloading tangled_up_in_unicode-0.0.7-py3-none-any.whl (3.1 MB) Requirement already satisfied: requests>=2.24.0 in c:\users\kwoit\anaconda3\lib\site-packages (from pandas-profiling==2.12.0) (2.25.1) Requirement already satisfied: tqdm>=4.48.2 in c:\users\kwoit\anaconda3\lib\site-packages (from pandas-profiling==2.12.0) (4.59.0) Requirement already satisfied: seaborn>=0.10.1 in 
c:\users\kwoit\anaconda3\lib\site-packages (from pandas-profiling==2.12.0) (0.11.1) Requirement already satisfied: networkx>=2.4 in c:\users\kwoit\anaconda3\lib\site-packages (from visions[type_image_path]==0.6.0->pandas-profiling==2.12.0) (2.5.1) Requirement already satisfied: Pillow in c:\users\kwoit\anaconda3\lib\site-packages (from visions[type_image_path]==0.6.0->pandas-profiling==2.12.0) (8.2.0) Collecting imagehash Downloading ImageHash-4.2.0-py2.py3-none-any.whl (295 kB) Requirement already satisfied: pyyaml in c:\users\kwoit\anaconda3\lib\site-packages (from confuse>=1.0.0->pandas-profiling==2.12.0) (5.4.1) Requirement already satisfied: MarkupSafe>=0.23 in c:\users\kwoit\anaconda3\lib\site-packages (from jinja2>=2.11.1->pandas-profiling==2.12.0) (1.1.1) Requirement already satisfied: kiwisolver>=1.0.1 in c:\users\kwoit\anaconda3\lib\site-packages (from matplotlib>=3.2.0->pandas-profiling==2.12.0) (1.3.1) Requirement already satisfied: python-dateutil>=2.1 in c:\users\kwoit\anaconda3\lib\site-packages (from matplotlib>=3.2.0->pandas-profiling==2.12.0) (2.8.1) Requirement already satisfied: cycler>=0.10 in c:\users\kwoit\anaconda3\lib\site-packages (from matplotlib>=3.2.0->pandas-profiling==2.12.0) (0.10.0) Requirement already satisfied: pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.3 in c:\users\kwoit\anaconda3\lib\site-packages (from matplotlib>=3.2.0->pandas-profiling==2.12.0) (2.4.7) Requirement already satisfied: six in c:\users\kwoit\anaconda3\lib\site-packages (from cycler>=0.10->matplotlib>=3.2.0->pandas-profiling==2.12.0) (1.15.0) Requirement already satisfied: decorator<5,>=4.3 in c:\users\kwoit\anaconda3\lib\site-packages (from networkx>=2.4->visions[type_image_path]==0.6.0->pandas-profiling==2.12.0) (4.4.2) Requirement already satisfied: pytz>=2017.3 in c:\users\kwoit\anaconda3\lib\site-packages (from pandas!=1.0.0,!=1.0.1,!=1.0.2,!=1.1.0,>=0.25.3->pandas-profiling==2.12.0) (2021.1) Requirement already satisfied: chardet<5,>=3.0.2 in 
c:\users\kwoit\anaconda3\lib\site-packages (from requests>=2.24.0->pandas-profiling==2.12.0) (4.0.0) Requirement already satisfied: certifi>=2017.4.17 in c:\users\kwoit\anaconda3\lib\site-packages (from requests>=2.24.0->pandas-profiling==2.12.0) (2020.12.5) Requirement already satisfied: idna<3,>=2.5 in c:\users\kwoit\anaconda3\lib\site-packages (from requests>=2.24.0->pandas-profiling==2.12.0) (2.10) Requirement already satisfied: urllib3<1.27,>=1.21.1 in c:\users\kwoit\anaconda3\lib\site-packages (from requests>=2.24.0->pandas-profiling==2.12.0) (1.26.4) Collecting PyWavelets Downloading PyWavelets-1.1.1-cp37-cp37m-win_amd64.whl (4.2 MB) Building wheels for collected packages: htmlmin, phik Building wheel for htmlmin (setup.py): started Building wheel for htmlmin (setup.py): finished with status 'done' Created wheel for htmlmin: filename=htmlmin-0.1.12-py3-none-any.whl size=27085 sha256=32554d5564a200e46f0a9bea623da0d015f1c41eda592ab5208e18764e28091f Stored in directory: c:\users\kwoit\appdata\local\pip\cache\wheels\70\e1\52\5b14d250ba868768823940c3229e9950d201a26d0bd3ee8655 Building wheel for phik (setup.py): started Building wheel for phik (setup.py): finished with status 'done' Created wheel for phik: filename=phik-0.11.2-py3-none-any.whl size=1107415 sha256=288b736f082bed9dd2ed560aad08c533aef95af1ca019d3940f7801666d5a573 Stored in directory: c:\users\kwoit\appdata\local\pip\cache\wheels\40\98\a3\b654f24edcdcdb87d1f70d65a506fcfdf15289db129c594bcd Successfully built htmlmin phik Installing collected packages: tangled-up-in-unicode, PyWavelets, imagehash, phik, missingno, htmlmin, confuse Successfully installed PyWavelets-1.1.1 confuse-1.4.0 htmlmin-0.1.12 imagehash-4.2.0 missingno-0.4.2 phik-0.11.2 tangled-up-in-unicode-0.0.7
# load packages
from ipywidgets import widgets
from pandas_profiling import ProfileReport
from pandas_profiling.utils.cache import cache_file
profile = ProfileReport(df, title="Test Dataset")
profile.to_notebook_iframe() # first rendering in iframe
profile.to_widgets() # second rendering, runs awhile
# remove whole columns
# drop(columns=...) is the keyword form of drop([...], axis=1)
tmp_df = df.drop(columns=['y', 'euribor3m'])
print(tmp_df.columns)
del tmp_df
Index(['age', 'job', 'marital', 'education', 'default', 'housing', 'loan',
'contact', 'month', 'day_of_week', 'duration', 'campaign', 'pdays',
'previous', 'poutcome', 'emp.var.rate', 'cons.price.idx',
'cons.conf.idx', 'nr.employed'],
dtype='object')
# remove rows due to a constraint, e.g. a certain column value
# boolean-mask selection via .loc keeps only rows where housing is known
mask = df['housing'] != 'unknown'
tmp_df = df.loc[mask]
print(tmp_df.shape)
del tmp_df
(10110, 21)
# change datatype
print(df.age.dtype)
tmp_df = df.dropna(subset=['age']).astype({'age': 'int64'}) # first: let out rows with NAs in 'age', then change datatype of 'age'
print(tmp_df.age.dtype)
del (tmp_df)
float64 int64
# unique values
print(df.job.unique())
print('----')
print(df.job.value_counts())
['admin.' 'blue-collar' 'technician' 'housemaid' 'retired' 'unemployed' 'services' 'student' 'unknown' 'management' 'self-employed' 'entrepreneur'] ---- admin. 2761 blue-collar 1966 technician 1704 services 914 management 720 retired 633 student 361 self-employed 360 entrepreneur 312 unemployed 281 housemaid 277 unknown 76 Name: job, dtype: int64
# Replace a unique value ('entrepreneur') with another ('self-employed')
tmp_df = df.copy()
tmp_df['job'] = tmp_df['job'].replace('entrepreneur', 'self-employed')
print(tmp_df['job'].value_counts())
del tmp_df
admin. 2761 blue-collar 1966 technician 1704 services 914 management 720 self-employed 672 retired 633 student 361 unemployed 281 housemaid 277 unknown 76 Name: job, dtype: int64
Imputing can conveniently be done with scikit-learn's sklearn.impute module. Available are:
Of course, imputing can be done manually with Pandas, too.
# NAs in 'marital'
print(df.marital.unique())
print('----')
print(df.marital.isna().sum())
['single' 'married' 'divorced' 'unknown' nan] ---- 102
# Categorical: fill NA with mode values, manually
tmp_df = df.copy()
# assign the result instead of using inplace=True on a column accessor:
# chained inplace fillna is deprecated and becomes a silent no-op under
# pandas copy-on-write (the default in pandas 3)
tmp_df['marital'] = tmp_df['marital'].fillna(tmp_df['marital'].mode()[0])
print(df.marital.value_counts())
print('----')
print(tmp_df.marital.value_counts())
del tmp_df
married 5973 single 3149 divorced 1121 unknown 20 Name: marital, dtype: int64 ---- married 6075 single 3149 divorced 1121 unknown 20 Name: marital, dtype: int64
df.age.isna().sum()
41
# Numeric: fill NA with mean values, manually
tmp_df = df.copy()
# assignment instead of inplace=True on the column accessor: inplace fillna on
# an intermediate Series is deprecated and unreliable under pandas copy-on-write
tmp_df['age'] = tmp_df['age'].fillna(tmp_df['age'].mean())
print(tmp_df.age.isna().sum())
del tmp_df
0
# Categorical: fill NA with mode values, with scikit-learn Simple Imputer
# here: only one certain column to impute
from sklearn.impute import SimpleImputer
tmp_df = df.copy()
mode_imputer = SimpleImputer(missing_values=np.nan, strategy="most_frequent")
# the imputer expects 2-D input, so select the column as a one-column frame
tmp_df['marital'] = mode_imputer.fit_transform(tmp_df[['marital']])
print(tmp_df['marital'].value_counts())
del tmp_df
married 6075 single 3149 divorced 1121 unknown 20 Name: marital, dtype: int64
# Numeric: fill NA with mean values, with Scikit-learn Simple Imputer
# here: treat ALL numeric columns with NAs
from sklearn.impute import SimpleImputer
tmp_df = df.copy().select_dtypes(exclude='object') # select ALL numeric columns for imputing
print(tmp_df.info())
print(tmp_df.age.describe())
print('----')
imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
# fit_transform returns a bare ndarray; rebuild the DataFrame with the source
# column labels instead of the default integer column names 0..n-1
tmp_df = pd.DataFrame(imp_mean.fit_transform(tmp_df), columns=tmp_df.columns)
print(tmp_df.info())
print(tmp_df.iloc[:,0].describe()) # mean is the same, standard deviation has changed
del tmp_df
<class 'pandas.core.frame.DataFrame'> RangeIndex: 10365 entries, 0 to 10364 Data columns (total 10 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 age 10324 non-null float64 1 duration 10365 non-null int64 2 campaign 10365 non-null int64 3 pdays 10365 non-null int64 4 previous 10365 non-null int64 5 emp.var.rate 10365 non-null float64 6 cons.price.idx 10365 non-null float64 7 cons.conf.idx 10365 non-null float64 8 euribor3m 10365 non-null float64 9 nr.employed 10365 non-null float64 dtypes: float64(6), int64(4) memory usage: 809.9 KB None count 10324.000000 mean 40.359938 std 11.800919 min 17.000000 25% 31.000000 50% 38.000000 75% 48.000000 max 98.000000 Name: age, dtype: float64 ---- <class 'pandas.core.frame.DataFrame'> RangeIndex: 10365 entries, 0 to 10364 Data columns (total 10 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 0 10365 non-null float64 1 1 10365 non-null float64 2 2 10365 non-null float64 3 3 10365 non-null float64 4 4 10365 non-null float64 5 5 10365 non-null float64 6 6 10365 non-null float64 7 7 10365 non-null float64 8 8 10365 non-null float64 9 9 10365 non-null float64 dtypes: float64(10) memory usage: 809.9 KB None count 10365.000000 mean 40.359938 std 11.777554 min 17.000000 25% 31.000000 50% 38.000000 75% 48.000000 max 98.000000 Name: 0, dtype: float64
# Recode nominals (one-hot encoding) via dummy variables, manually
# drop_first=True omits the first (alphabetical) level, here 'divorced'
dummies_df = pd.get_dummies(df.copy(), columns=['marital'], drop_first=True)
print(dummies_df.columns)
del dummies_df
Index(['age', 'job', 'education', 'default', 'housing', 'loan', 'contact',
'month', 'day_of_week', 'duration', 'campaign', 'pdays', 'previous',
'poutcome', 'emp.var.rate', 'cons.price.idx', 'cons.conf.idx',
'euribor3m', 'nr.employed', 'y', 'marital_married', 'marital_single',
'marital_unknown'],
dtype='object')
# Recoding ordinals to numeric levels, manually
tmp_df = df.copy()
# explicit level ordering for 'education', lowest to highest
education_levels = {
    'illiterate': 0,
    'unknown': 0,  # possible decision: merge level 'unknown' with 'illiterate'
    'basic.4y': 1,
    'basic.6y': 2,
    'basic.9y': 3,
    'professional.course': 4,
    'high.school': 5,
    'university.degree': 6,
}
tmp_df.replace({'education': education_levels}, inplace=True)
print(df.education.value_counts())
print('----')
print(tmp_df.education.value_counts())
del tmp_df
university.degree 3284 high.school 2358 professional.course 1374 basic.9y 1321 basic.4y 1015 basic.6y 528 unknown 479 illiterate 6 Name: education, dtype: int64 ---- 6 3284 5 2358 4 1374 3 1321 1 1015 2 528 0 485 Name: education, dtype: int64
# Recode ordinals with OrdinalEncoder (orders levels alphabetically)
from sklearn.preprocessing import OrdinalEncoder
enc = OrdinalEncoder() # instantiate encoder
tmp_df = df.copy()
print(tmp_df['education'][0:6])
# reshape(-1,1): the encoder expects a 2-D array of samples x features
tmp_df['education'] = enc.fit_transform(tmp_df['education'].values.reshape(-1,1))
print(tmp_df['education'][0:6])
print('----')
print(enc.categories_) # levels automatically in alphabetical order
del (tmp_df)
0 university.degree
1 university.degree
2 university.degree
3 high.school
4 professional.course
5 basic.4y
Name: education, dtype: object
0 6.0
1 6.0
2 6.0
3 3.0
4 5.0
5 0.0
Name: education, dtype: float64
----
[array(['basic.4y', 'basic.6y', 'basic.9y', 'high.school', 'illiterate',
'professional.course', 'university.degree', 'unknown'],
dtype=object)]
# Standardize a certain column
from sklearn.preprocessing import StandardScaler
col_scaler = StandardScaler()  # instantiate scaler
tmp_df = df.copy()
print(tmp_df['cons.conf.idx'][0:6])
# pass a one-column frame: the scaler expects 2-D input
tmp_df['cons.conf.idx'] = col_scaler.fit_transform(tmp_df[['cons.conf.idx']])
print(tmp_df['cons.conf.idx'][0:6])
del tmp_df
0 -47.1 1 -31.4 2 -42.7 3 -47.1 4 -42.7 5 -46.2 Name: cons.conf.idx, dtype: float64 0 -1.304175 1 1.683279 2 -0.466927 3 -1.304175 4 -0.466927 5 -1.132920 Name: cons.conf.idx, dtype: float64
# Standardize 2 and more columns at once
from sklearn.preprocessing import StandardScaler
tmp_df = df.copy()
cols = ['cons.conf.idx', 'cons.price.idx']
print(tmp_df[cols][0:6])
tmp_df[cols] = StandardScaler().fit_transform(tmp_df[cols])  # scale both columns in one pass
print(tmp_df[cols][0:6])
del tmp_df
cons.conf.idx cons.price.idx 0 -47.1 93.075 1 -31.4 92.201 2 -42.7 93.918 3 -47.1 93.075 4 -42.7 93.918 5 -46.2 92.893 cons.conf.idx cons.price.idx 0 -1.304175 -0.668983 1 1.683279 -2.062256 2 -0.466927 0.674873 3 -1.304175 -0.668983 4 -0.466927 0.674873 5 -1.132920 -0.959115
#data_path = '' # insert path where file should be saved
#chdir(data_path)
# index=False keeps the row index out of the file, so re-reading the csv
# later does not produce a spurious 'Unnamed: 0' column
df.to_csv('bank_preprocessed.csv', sep=';', index=False)